<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - April 23, 2025</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        :root {
            /* New Palette: Light, Clean, Futuristic with Teal/Aqua accents */
            --bg-color: #f8fafc; /* Tailwind slate-50 (Very Light Gray) */
            --card-bg-color: #ffffff; /* White */
            --text-color: #1e293b; /* Tailwind slate-800 (Dark Gray-Blue) */
            --text-muted-color: #64748b; /* Tailwind slate-500 (Medium Gray-Blue) */
            --header-color: #0f172a; /* Tailwind slate-900 (Very Dark Blue) */
            --highlight-primary: #14b8a6; /* Tailwind teal-500 */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300 */
            --border-color: #e2e8f0; /* Tailwind slate-200 (Light Gray) */
            --shadow-color: rgba(15, 23, 42, 0.08); /* Subtle shadow based on slate-900 */
        }

        body {
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            overflow-x: hidden; /* Prevent horizontal scroll */
            line-height: 1.6;
        }

        .bento-grid {
            display: grid;
            gap: 1.5rem; /* Tailwind gap-6 */
            grid-template-columns: 1fr; /* Force single column */
            padding-bottom: 4rem; /* Add padding at the bottom */
        }

        .bento-item {
            /* Apply semi-transparent white background and blur */
            background-color: rgba(255, 255, 255, 0.7); /* White with 70% opacity */
            backdrop-filter: blur(10px); /* Apply blur effect */
            -webkit-backdrop-filter: blur(10px); /* Safari prefix */
            border-radius: 1rem; /* Slightly larger radius */
            padding: 1.75rem; /* Slightly more padding */
            border: 1px solid rgba(226, 232, 240, 0.5); /* Lighter border with transparency */
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: transform 0.3s ease-out, box-shadow 0.3s ease-out, background-color 0.3s ease-out;
            overflow: hidden; /* Ensure content doesn't overflow */
            position: relative; /* For potential pseudo-elements */
        }

        /* Removed ::before pseudo-element for a cleaner look */


        .bento-item:hover {
            transform: translateY(-6px);
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06); /* Adjusted hover shadow */
        }

        .paper-title {
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            color: var(--highlight-primary); /* Use new primary highlight */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            line-height: 1.4;
        }

        .paper-summary {
            font-size: 0.875rem; /* Tailwind text-sm */
            color: var(--text-muted-color);
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            line-height: 1.6;
        }

        .paper-link {
            display: inline-flex; /* Use flex for icon alignment */
            align-items: center;
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            color: var(--highlight-primary);
            text-decoration: none;
            padding: 0.5rem 1rem; /* Add padding */
            border-radius: 0.5rem; /* Slightly rounder */
            background-color: rgba(20, 184, 166, 0.08); /* Subtle teal background */
            border: 1px solid rgba(20, 184, 166, 0.2);
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        .paper-link:hover {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Darker teal on hover */
            transform: translateY(-1px);
        }
        .paper-link:hover i {
             transform: translateX(2px);
        }

        .paper-authors {
            font-size: 0.75rem; /* Tailwind text-xs */
            color: var(--text-muted-color);
            margin-top: 1rem; /* Tailwind mt-4 */
            font-style: italic;
        }

        .header {
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        .header h1 {
            font-size: 2.5rem; /* Tailwind text-4xl or 5xl */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional: Add a subtle text gradient */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        .footer {
            text-align: center;
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            padding-top: 2rem;
            padding-bottom: 2rem; /* Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            margin-top: 4rem;
        }

        /* Simple line graphic element (optional) */
        .line-graphic {
            height: 1px; /* Thinner line */
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
            margin: 1.5rem 0; /* Adjust margin */
        }

        /* Framer Motion requires the script, styles enhance appearance */
        [data-motion-element] {
             /* Base styles for elements animated by Framer Motion */
        }

        .paper-tldr {
            font-size: 0.95rem; /* Slightly bigger than summary */
            color: #475569; /* Changed to Tailwind slate-600 (slightly darker than summary) */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-2 */
            /* font-style: italic; */
            font-weight: bold;
        }

        .paper-rating {
            margin-top: 1rem; /* Tailwind mt-4 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Apply consistent star color to sub-ratings */
        .paper-sub-ratings .rating-item i {
            color: #f59e0b; /* Match overall rating star color (amber-500) */
            margin-right: 0.125rem; /* Consistent spacing */
        }

    </style>
</head>
<body class="container mx-auto px-4 antialiased">

    <motion.div
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
        class="header"
        data-motion-element
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <p>April 23, 2025</p>
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div> <!-- Added line graphic -->
    </motion.div>

    <div class="bento-grid" id="paper-grid">
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Survey of Video Diffusion Models: Foundations, Implementations, and Applications</h2>
            <p class="paper-summary">Recent advances in diffusion models have revolutionized video generation,
offering superior temporal consistency and visual quality compared to
traditional generative adversarial networks-based approaches. While this
emerging field shows tremendous promise in applications, it faces significant
challenges in motion consistency, computational efficiency, and ethical
considerations. This survey provides a comprehensive review of diffusion-based
video generation, examining its evolution, technical foundations, and practical
applications. We present a systematic taxonomy of current methodologies,
analyze architectural innovations and optimization strategies, and investigate
applications across low-level vision tasks such as denoising and
super-resolution. Additionally, we explore the synergies between diffusionbased
video generation and related domains, including video representation learning,
question answering, and retrieval. Compared to the existing surveys (Lei et
al., 2024a;b; Melnik et al., 2024; Cao et al., 2023; Xing et al., 2024c) which
focus on specific aspects of video generation, such as human video synthesis
(Lei et al., 2024a) or long-form content generation (Lei et al., 2024b), our
work provides a broader, more updated, and more fine-grained perspective on
diffusion-based approaches with a special section for evaluation metrics,
industry solutions, and training engineering techniques in video generation.
This survey serves as a foundational resource for researchers and practitioners
working at the intersection of diffusion models and video generation, providing
insights into both the theoretical frameworks and practical implementations
that drive this rapidly evolving field. A structured list of related works
involved in this survey is also available on
https://github.com/Eyeline-Research/Survey-Video-Diffusion.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this survey paper provides a comprehensive overview of video diffusion models, covering their foundations, implementations, and applications, while also addressing challenges and ethical considerations. it distinguishes itself from existing surveys by offering a broader and more fine-grained perspective, with a special focus on evaluation metrics, industry solutions, and training engineering techniques.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 这篇综述论文全面概述了视频扩散模型，涵盖了其基础、实现和应用，同时讨论了挑战和伦理考量。它通过提供更广泛、更细致的视角，并特别关注评估指标、行业解决方案和训练工程技术，从而区别于现有的同类综述。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(10/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(9/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16081v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yimu Wang, Xuye Liu, Wei Pang, Li Ma, Shuai Yuan, Paul Debevec, Ning Yu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">From Reflection to Perfection: Scaling Inference-Time Optimization for Text-to-Image Diffusion Models via Reflection Tuning</h2>
            <p class="paper-summary">Recent text-to-image diffusion models achieve impressive visual quality
through extensive scaling of training data and model parameters, yet they often
struggle with complex scenes and fine-grained details. Inspired by the
self-reflection capabilities emergent in large language models, we propose
ReflectionFlow, an inference-time framework enabling diffusion models to
iteratively reflect upon and refine their outputs. ReflectionFlow introduces
three complementary inference-time scaling axes: (1) noise-level scaling to
optimize latent initialization; (2) prompt-level scaling for precise semantic
guidance; and most notably, (3) reflection-level scaling, which explicitly
provides actionable reflections to iteratively assess and correct previous
generations. To facilitate reflection-level scaling, we construct GenRef, a
large-scale dataset comprising 1 million triplets, each containing a
reflection, a flawed image, and an enhanced image. Leveraging this dataset, we
efficiently perform reflection tuning on state-of-the-art diffusion
transformer, FLUX.1-dev, by jointly modeling multimodal inputs within a unified
framework. Experimental results show that ReflectionFlow significantly
outperforms naive noise-level scaling methods, offering a scalable and
compute-efficient solution toward higher-quality image synthesis on challenging
tasks.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces reflectionflow, an inference-time optimization framework for text-to-image diffusion models, leveraging self-reflection and a tuned diffusion transformer on a large-scale dataset to enhance image quality, especially for complex scenes.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了reflectionflow，一个用于文本到图像扩散模型的推理时优化框架，它利用自反思和一个在大型数据集上调整的扩散转换器来提高图像质量，尤其是在复杂场景中。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16080v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Le Zhuo, Liangbing Zhao, Sayak Paul, Yue Liao, Renrui Zhang, Yi Xin, Peng Gao, Mohamed Elhoseiny, Hongsheng Li</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Describe Anything: Detailed Localized Image and Video Captioning</h2>
            <p class="paper-summary">Generating detailed and accurate descriptions for specific regions in images
and videos remains a fundamental challenge for vision-language models. We
introduce the Describe Anything Model (DAM), a model designed for detailed
localized captioning (DLC). DAM preserves both local details and global context
through two key innovations: a focal prompt, which ensures high-resolution
encoding of targeted regions, and a localized vision backbone, which integrates
precise localization with its broader context. To tackle the scarcity of
high-quality DLC data, we propose a Semi-supervised learning (SSL)-based Data
Pipeline (DLC-SDP). DLC-SDP starts with existing segmentation datasets and
expands to unlabeled web images using SSL. We introduce DLC-Bench, a benchmark
designed to evaluate DLC without relying on reference captions. DAM sets new
state-of-the-art on 7 benchmarks spanning keyword-level, phrase-level, and
detailed multi-sentence localized image and video captioning.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces the describe anything model (dam) for detailed localized image and video captioning, utilizing a focal prompt and localized vision backbone, and a semi-supervised learning pipeline (dlc-sdp) to address data scarcity. dam achieves state-of-the-art results on multiple benchmarks.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了 describe anything model (dam)，用于生成图像和视频的详细局部描述，利用焦点提示和局部视觉骨干网络，以及半监督学习管道 (dlc-sdp) 来解决数据稀缺问题。dam 在多个基准测试中取得了最先进的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16072v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Long Lian, Yifan Ding, Yunhao Ge, Sifei Liu, Hanzi Mao, Boyi Li, Marco Pavone, Ming-Yu Liu, Trevor Darrell, Adam Yala, Yin Cui</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.15000000000000002, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Boosting Generative Image Modeling via Joint Image-Feature Synthesis</h2>
            <p class="paper-summary">Latent diffusion models (LDMs) dominate high-quality image generation, yet
integrating representation learning with generative modeling remains a
challenge. We introduce a novel generative image modeling framework that
seamlessly bridges this gap by leveraging a diffusion model to jointly model
low-level image latents (from a variational autoencoder) and high-level
semantic features (from a pretrained self-supervised encoder like DINO). Our
latent-semantic diffusion approach learns to generate coherent image-feature
pairs from pure noise, significantly enhancing both generative quality and
training efficiency, all while requiring only minimal modifications to standard
Diffusion Transformer architectures. By eliminating the need for complex
distillation objectives, our unified design simplifies training and unlocks a
powerful new inference strategy: Representation Guidance, which leverages
learned semantics to steer and refine image generation. Evaluated in both
conditional and unconditional settings, our method delivers substantial
improvements in image quality and training convergence speed, establishing a
new direction for representation-aware generative modeling.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a latent-semantic diffusion model that jointly models image latents and semantic features using a diffusion model to improve image generation quality and training efficiency. it uses representation guidance for image refinement.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种潜在语义扩散模型，该模型使用扩散模型联合建模图像潜在变量和语义特征，以提高图像生成质量和训练效率。它使用表征引导进行图像细化。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16064v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Theodoros Kouzelis, Efstathios Karypidis, Ioannis Kakogeorgiou, Spyros Gidaris, Nikos Komodakis</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Efficient Temporal Consistency in Diffusion-Based Video Editing with Adaptor Modules: A Theoretical Framework</h2>
            <p class="paper-summary">Adapter-based methods are commonly used to enhance model performance with
minimal additional complexity, especially in video editing tasks that require
frame-to-frame consistency. By inserting small, learnable modules into
pretrained diffusion models, these adapters can maintain temporal coherence
without extensive retraining. Approaches that incorporate prompt learning with
both shared and frame-specific tokens are particularly effective in preserving
continuity across frames at low training cost. In this work, we want to provide
a general theoretical framework for adapters that maintain frame consistency in
DDIM-based models under a temporal consistency loss. First, we prove that the
temporal consistency objective is differentiable under bounded feature norms,
and we establish a Lipschitz bound on its gradient. Second, we show that
gradient descent on this objective decreases the loss monotonically and
converges to a local minimum if the learning rate is within an appropriate
range. Finally, we analyze the stability of modules in the DDIM inversion
procedure, showing that the associated error remains controlled. These
theoretical findings will reinforce the reliability of diffusion-based video
editing methods that rely on adapter strategies and provide theoretical
insights in video generation tasks.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper provides a theoretical framework for adapter-based diffusion models that maintain temporal consistency in video editing, proving differentiability, convergence, and stability.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文为基于适配器的扩散模型提供了一个理论框架，该框架在视频编辑中保持时间一致性，证明了可微性、收敛性和稳定性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16016v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Xinyuan Song, Yangfan He, Sida Li, Jianhui Wang, Hongyang He, Xinhang Yuan, Ruoyu Wang, Jiaqi Chen, Keqin Li, Kuan Lu, Menghao Huo, Binxu Li, Pei Liu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">FreeGraftor: Training-Free Cross-Image Feature Grafting for Subject-Driven Text-to-Image Generation</h2>
            <p class="paper-summary">Subject-driven image generation aims to synthesize novel scenes that
faithfully preserve subject identity from reference images while adhering to
textual guidance, yet existing methods struggle with a critical trade-off
between fidelity and efficiency. Tuning-based approaches rely on time-consuming
and resource-intensive subject-specific optimization, while zero-shot methods
fail to maintain adequate subject consistency. In this work, we propose
FreeGraftor, a training-free framework that addresses these limitations through
cross-image feature grafting. Specifically, FreeGraftor employs semantic
matching and position-constrained attention fusion to transfer visual details
from reference subjects to the generated image. Additionally, our framework
incorporates a novel noise initialization strategy to preserve geometry priors
of reference subjects for robust feature matching. Extensive qualitative and
quantitative experiments demonstrate that our method enables precise subject
identity transfer while maintaining text-aligned scene synthesis. Without
requiring model fine-tuning or additional training, FreeGraftor significantly
outperforms existing zero-shot and training-free approaches in both subject
fidelity and text alignment. Furthermore, our framework can seamlessly extend
to multi-subject generation, making it practical for real-world deployment. Our
code is available at https://github.com/Nihukat/FreeGraftor.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: freegraftor is a training-free method for subject-driven text-to-image generation that uses cross-image feature grafting to improve subject fidelity and text alignment without fine-tuning. it also preserves the geometry prior of the reference subject.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: freegraftor是一种无需训练的主题驱动文本到图像生成方法，它使用跨图像特征嫁接来提高主题保真度和文本对齐，而无需微调。它还保留了参考对象的几何先验。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15958v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zebin Yao, Lei Ren, Huixing Jiang, Chen Wei, Xiaojie Wang, Ruifan Li, Fangxiang Feng</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.30000000000000004, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Reasoning Physical Video Generation with Diffusion Timestep Tokens via Reinforcement Learning</h2>
            <p class="paper-summary">Despite recent progress in video generation, producing videos that adhere to
physical laws remains a significant challenge. Traditional diffusion-based
methods struggle to extrapolate to unseen physical conditions (eg, velocity)
due to their reliance on data-driven approximations. To address this, we
propose to integrate symbolic reasoning and reinforcement learning to enforce
physical consistency in video generation. We first introduce the Diffusion
Timestep Tokenizer (DDT), which learns discrete, recursive visual tokens by
recovering visual attributes lost during the diffusion process. The recursive
visual tokens enable symbolic reasoning by a large language model. Based on it,
we propose the Phys-AR framework, which consists of two stages: The first stage
uses supervised fine-tuning to transfer symbolic knowledge, while the second
stage applies reinforcement learning to optimize the model's reasoning
abilities through reward functions based on physical conditions. Our approach
allows the model to dynamically adjust and improve the physical properties of
generated videos, ensuring adherence to physical laws. Experimental results
demonstrate that PhysAR can generate videos that are physically consistent.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces phys-ar, a framework combining diffusion models, symbolic reasoning (via llms), and reinforcement learning to generate physically consistent videos. it uses diffusion timestep tokens (ddt) to incorporate attributes and improve on data-driven limitations of diffusion models.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为phys-ar的框架，它结合了扩散模型、符号推理（通过llm）和强化学习来生成符合物理规律的视频。它使用扩散时间步长令牌（ddt）来整合属性，并改进了扩散模型在数据驱动方面的局限性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15932v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Wang Lin, Liyu Jia, Wentao Hu, Kaihang Pan, Zhongqi Yue, Wei Zhao, Jingyuan Chen, Fei Wu, Hanwang Zhang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.35000000000000003, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Satellite to GroundScape -- Large-scale Consistent Ground View Generation from Satellite Views</h2>
            <p class="paper-summary">Generating consistent ground-view images from satellite imagery is
challenging, primarily due to the large discrepancies in viewing angles and
resolution between satellite and ground-level domains. Previous efforts mainly
concentrated on single-view generation, often resulting in inconsistencies
across neighboring ground views. In this work, we propose a novel cross-view
synthesis approach designed to overcome these challenges by ensuring
consistency across ground-view images generated from satellite views. Our
method, based on a fixed latent diffusion model, introduces two conditioning
modules: satellite-guided denoising, which extracts high-level scene layout to
guide the denoising process, and satellite-temporal denoising, which captures
camera motion to maintain consistency across multiple generated views. We
further contribute a large-scale satellite-ground dataset containing over
100,000 perspective pairs to facilitate extensive ground scene or video
generation. Experimental results demonstrate that our approach outperforms
existing methods on perceptual and temporal metrics, achieving high
photorealism and consistency in multi-view outputs.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a novel cross-view synthesis approach using a diffusion model with satellite-guided and satellite-temporal denoising to generate consistent ground-view images from satellite imagery, accompanied by a new large-scale dataset.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文提出了一种新颖的跨视角合成方法，使用扩散模型和卫星引导及卫星时间去噪技术，从卫星图像生成一致的地面图像，并伴随一个新的大规模数据集。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15786v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Ningli Xu, Rongjun Qin</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.4, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Structure-Preserving Zero-Shot Image Editing via Stage-Wise Latent Injection in Diffusion Models</h2>
            <p class="paper-summary">We propose a diffusion-based framework for zero-shot image editing that
unifies text-guided and reference-guided approaches without requiring
fine-tuning. Our method leverages diffusion inversion and timestep-specific
null-text embeddings to preserve the structural integrity of the source image.
By introducing a stage-wise latent injection strategy-shape injection in early
steps and attribute injection in later steps-we enable precise, fine-grained
modifications while maintaining global consistency. Cross-attention with
reference latents facilitates semantic alignment between the source and
reference. Extensive experiments across expression transfer, texture
transformation, and style infusion demonstrate state-of-the-art performance,
confirming the method's scalability and adaptability to diverse image editing
scenarios.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper presents a diffusion-based zero-shot image editing framework that combines text-guided and reference-guided approaches using stage-wise latent injection to preserve structural integrity while modifying image attributes.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种基于扩散模型的零样本图像编辑框架，该框架结合了文本引导和参考引导方法，通过阶段式潜在注入来保持结构完整性，同时修改图像属性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15723v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Dasol Jeong, Donggoo Kang, Jiwon Park, Hyebean Lee, Joonki Paik</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.45, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Vidi: Large Multimodal Models for Video Understanding and Editing</h2>
            <p class="paper-summary">Humans naturally share information with those they are connected to, and
video has become one of the dominant mediums for communication and expression
on the Internet. To support the creation of high-quality large-scale video
content, a modern pipeline requires a comprehensive understanding of both the
raw input materials (e.g., the unedited footage captured by cameras) and the
editing components (e.g., visual effects). In video editing scenarios, models
must process multiple modalities (e.g., vision, audio, text) with strong
background knowledge and handle flexible input lengths (e.g., hour-long raw
videos), which poses significant challenges for traditional models. In this
report, we introduce Vidi, a family of Large Multimodal Models (LMMs) for a
wide range of video understand editing scenarios. The first release focuses on
temporal retrieval, i.e., identifying the time ranges within the input videos
corresponding to a given text query, which plays a critical role in intelligent
editing. The model is capable of processing hour-long videos with strong
temporal understanding capability, e.g., retrieve time ranges for certain
queries. To support a comprehensive evaluation in real-world scenarios, we also
present the VUE-TR benchmark, which introduces five key advancements. 1) Video
duration: significantly longer than videos of existing temporal retrival
datasets, 2) Audio support: includes audio-based queries, 3) Query format:
diverse query lengths/formats, 4) Annotation quality: ground-truth time ranges
are manually annotated. 5) Evaluation metric: a refined IoU metric to support
evaluation over multiple time ranges. Remarkably, Vidi significantly
outperforms leading proprietary models, e.g., GPT-4o and Gemini, on the
temporal retrieval task, indicating its superiority in video editing scenarios.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces vidi, a family of large multimodal models (lmms) designed for video understanding and editing, specifically addressing temporal retrieval in long videos and demonstrating superior performance compared to existing models like gpt-4o and gemini. it also introduces vue-tr, a new benchmark for video temporal retrieval.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了vidi，一种用于视频理解和编辑的大型多模态模型（lmms），专门解决长视频中的时序检索问题，并展示了优于现有模型（如gpt-4o和gemini）的性能。同时，还介绍了一个新的视频时序检索基准 vue-tr。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15681v2" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Vidi Team, Celong Liu, Chia-Wen Kuo, Dawei Du, Fan Chen, Guang Chen, Jiamin Yuan, Lingxi Zhang, Lu Guo, Lusha Li, Longyin Wen, Qingyu Chen, Rachel Deng, Sijie Zhu, Stuart Siew, Tong Jin, Wei Lu, Wen Zhong, Xiaohui Shen, Xin Gu, Xing Mei, Xueqiong Qu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.5, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">DiTPainter: Efficient Video Inpainting with Diffusion Transformers</h2>
            <p class="paper-summary">Many existing video inpainting algorithms utilize optical flows to construct
the corresponding maps and then propagate pixels from adjacent frames to
missing areas by mapping. Despite the effectiveness of the propagation
mechanism, they might encounter blurry and inconsistencies when dealing with
inaccurate optical flows or large masks. Recently, Diffusion Transformer (DiT)
has emerged as a revolutionary technique for video generation tasks. However,
pretrained DiT models for video generation all contain a large amount of
parameters, which makes it very time consuming to apply to video inpainting
tasks. In this paper, we present DiTPainter, an end-to-end video inpainting
model based on Diffusion Transformer (DiT). DiTPainter uses an efficient
transformer network designed for video inpainting, which is trained from
scratch instead of initializing from any large pretrained models. DiTPainter
can address videos with arbitrary lengths and can be applied to video
decaptioning and video completion tasks with an acceptable time cost.
Experiments show that DiTPainter outperforms existing video inpainting
algorithms with higher quality and better spatial-temporal consistency.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: ditpainter is a new diffusion transformer based video inpainting model that's trained from scratch, achieving improved quality and consistency compared to existing methods while addressing limitations of optical flow-based approaches.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: ditpainter是一种新的基于diffusion transformer的视频修复模型，该模型从头开始训练，与现有方法相比，提高了质量和一致性，同时解决了基于光流方法存在的局限性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15661v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Xian Wu, Chang Liu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.55, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">AdaViP: Aligning Multi-modal LLMs via Adaptive Vision-enhanced Preference Optimization</h2>
            <p class="paper-summary">Preference alignment through Direct Preference Optimization (DPO) has
demonstrated significant effectiveness in aligning multimodal large language
models (MLLMs) with human preferences. However, existing methods focus
primarily on language preferences while neglecting the critical visual context.
In this paper, we propose an Adaptive Vision-enhanced Preference optimization
(AdaViP) that addresses these limitations through two key innovations: (1)
vision-based preference pair construction, which integrates multiple visual
foundation models to strategically remove key visual elements from the image,
enhancing MLLMs' sensitivity to visual details; and (2) adaptive preference
optimization that dynamically balances vision- and language-based preferences
for more accurate alignment. Extensive evaluations across different benchmarks
demonstrate our effectiveness. Notably, our AdaViP-7B achieves 93.7% and 96.4%
reductions in response-level and mentioned-level hallucination respectively on
the Object HalBench, significantly outperforming current state-of-the-art
methods.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: adavip enhances multimodal llm alignment by adaptively balancing vision and language preferences and strategically removing visual elements to improve sensitivity to visual details, leading to significant reductions in object hallucination.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: adavip通过自适应地平衡视觉和语言偏好，并策略性地移除视觉元素以提高对视觉细节的敏感度，从而增强多模态llm的对齐，显著减少了对象幻觉。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15619v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jinda Lu, Jinghan Li, Yuan Gao, Junkang Wu, Jiancan Wu, Xiang Wang, Xiangnan He</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.6000000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Emergence and Evolution of Interpretable Concepts in Diffusion Models</h2>
            <p class="paper-summary">Diffusion models have become the go-to method for text-to-image generation,
producing high-quality images from noise through a process called reverse
diffusion. Understanding the dynamics of the reverse diffusion process is
crucial in steering the generation and achieving high sample quality. However,
the inner workings of diffusion models is still largely a mystery due to their
black-box nature and complex, multi-step generation process. Mechanistic
Interpretability (MI) techniques, such as Sparse Autoencoders (SAEs), aim at
uncovering the operating principles of models through granular analysis of
their internal representations. These MI techniques have been successful in
understanding and steering the behavior of large language models at scale.
However, the great potential of SAEs has not yet been applied toward gaining
insight into the intricate generative process of diffusion models. In this
work, we leverage the SAE framework to probe the inner workings of a popular
text-to-image diffusion model, and uncover a variety of human-interpretable
concepts in its activations. Interestingly, we find that even before the first
reverse diffusion step is completed, the final composition of the scene can be
predicted surprisingly well by looking at the spatial distribution of activated
concepts. Moreover, going beyond correlational analysis, we show that the
discovered concepts have a causal effect on the model output and can be
leveraged to steer the generative process. We design intervention techniques
aimed at manipulating image composition and style, and demonstrate that (1) in
early stages of diffusion image composition can be effectively controlled, (2)
in the middle stages of diffusion image composition is finalized, however
stylistic interventions are effective, and (3) in the final stages of diffusion
only minor textural details are subject to change.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper applies mechanistic interpretability (mi) techniques, specifically sparse autoencoders (saes), to diffusion models, revealing interpretable concepts and demonstrating their causal effect on image generation, enabling controlled steering of the generative process.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文将机制可解释性（mi）技术，特别是稀疏自编码器（sae），应用于扩散模型，揭示了可解释的概念，并展示了它们对图像生成的因果效应，从而能够控制生成过程。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15473v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Berk Tinaz, Zalan Fabian, Mahdi Soltanolkotabi</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.65, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Manifold Induced Biases for Zero-shot and Few-shot Detection of Generated Images</h2>
            <p class="paper-summary">Distinguishing between real and AI-generated images, commonly referred to as
'image detection', presents a timely and significant challenge. Despite
extensive research in the (semi-)supervised regime, zero-shot and few-shot
solutions have only recently emerged as promising alternatives. Their main
advantage is in alleviating the ongoing data maintenance, which quickly becomes
outdated due to advances in generative technologies. We identify two main gaps:
(1) a lack of theoretical grounding for the methods, and (2) significant room
for performance improvements in zero-shot and few-shot regimes. Our approach is
founded on understanding and quantifying the biases inherent in generated
content, where we use these quantities as criteria for characterizing generated
images. Specifically, we explore the biases of the implicit probability
manifold, captured by a pre-trained diffusion model. Through score-function
analysis, we approximate the curvature, gradient, and bias towards points on
the probability manifold, establishing criteria for detection in the zero-shot
regime. We further extend our contribution to the few-shot setting by employing
a mixture-of-experts methodology. Empirical results across 20 generative models
demonstrate that our method outperforms current approaches in both zero-shot
and few-shot settings. This work advances the theoretical understanding and
practical usage of generated content biases through the lens of manifold
analysis.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a zero-shot and few-shot method for detecting ai-generated images by analyzing biases in the implicit probability manifold captured by pre-trained diffusion models, achieving superior performance compared to existing methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种零样本和少样本方法，通过分析预训练扩散模型捕获的隐式概率流形中的偏差来检测ai生成的图像，与现有方法相比，实现了更优越的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15470v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jonathan Brokman, Amit Giloni, Omer Hofman, Roman Vainshtein, Hisashi Kojima, Guy Gilboa</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.7000000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Physics Driven Image Simulation from Commercial Satellite Imagery</h2>
            <p class="paper-summary">Physics driven image simulation allows for the modeling and creation of
realistic imagery beyond what is afforded by typical rendering pipelines. We
aim to automatically generate a physically realistic scene for simulation of a
given region using satellite imagery to model the scene geometry, drive
material estimates, and populate the scene with dynamic elements. We present
automated techniques to utilize satellite imagery throughout the simulated
scene to expedite scene construction and decrease manual overhead. Our
technique does not use lidar, enabling simulations that could not be
constructed previously. To develop a 3D scene, we model the various components
of the real location, addressing the terrain, modelling man-made structures,
and populating the scene with smaller elements such as vegetation and vehicles.
To create the scene we begin with a Digital Surface Model, which serves as the
basis for scene geometry, and allows us to reason about the real location in a
common 3D frame of reference. These simulated scenes can provide increased
fidelity with less manual intervention for novel locations on earth, and can
facilitate algorithm development, and processing pipelines for imagery ranging
from UV to LWIR $(200nm-20\mu m)$.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper presents an automated method for generating physically realistic 3d scenes from satellite imagery, without relying on lidar, for applications such as algorithm development and imagery processing across a wide spectrum of wavelengths.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种利用卫星图像自动生成物理上逼真的3d场景的方法，无需依赖激光雷达，可用于算法开发和图像处理等应用，并涵盖广泛的波长范围。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15378v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Scott Sorensen, Wayne Treible, Robert Wagner, Andrew D. Gilliam, Todd Rovito, Joseph L. Mundy</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.75, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">MMInference: Accelerating Pre-filling for Long-Context VLMs via Modality-Aware Permutation Sparse Attention</h2>
            <p class="paper-summary">The integration of long-context capabilities with visual understanding
unlocks unprecedented potential for Vision Language Models (VLMs). However, the
quadratic attention complexity during the pre-filling phase remains a
significant obstacle to real-world deployment. To overcome this limitation, we
introduce MMInference (Multimodality Million tokens Inference), a dynamic
sparse attention method that accelerates the prefilling stage for long-context
multi-modal inputs. First, our analysis reveals that the temporal and spatial
locality of video input leads to a unique sparse pattern, the Grid pattern.
Simultaneously, VLMs exhibit markedly different sparse distributions across
different modalities. We introduce a permutation-based method to leverage the
unique Grid pattern and handle modality boundary issues. By offline search the
optimal sparse patterns for each head, MMInference constructs the sparse
distribution dynamically based on the input. We also provide optimized GPU
kernels for efficient sparse computations. Notably, MMInference integrates
seamlessly into existing VLM pipelines without any model modifications or
fine-tuning. Experiments on multi-modal benchmarks-including Video QA,
Captioning, VisionNIAH, and Mixed-Modality NIAH-with state-of-the-art
long-context VLMs (LongVila, LlavaVideo, VideoChat-Flash, Qwen2.5-VL) show that
MMInference accelerates the pre-filling stage by up to 8.3x at 1M tokens while
maintaining accuracy. Our code is available at https://aka.ms/MMInference.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces mminference, a modality-aware sparse attention method that accelerates the pre-filling stage for long-context vlms by leveraging the temporal and spatial locality of video input without model modification, achieving up to 8.3x speedup at 1m tokens while maintaining accuracy.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了mminference，一种模态感知的稀疏注意力方法，通过利用视频输入的时间和空间局部性来加速长上下文vlms的预填充阶段，无需模型修改，在1m tokens的情况下实现了高达8.3倍的加速，同时保持了准确性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16083v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yucheng Li, Huiqiang Jiang, Chengruidong Zhang, Qianhui Wu, Xufang Luo, Surin Ahn, Amir H. Abdi, Dongsheng Li, Jianfeng Gao, Yuqing Yang, Lili Qiu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.8, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">ViSMaP: Unsupervised Hour-long Video Summarisation by Meta-Prompting</h2>
            <p class="paper-summary">We introduce ViSMap: Unsupervised Video Summarisation by Meta Prompting, a
system to summarise hour long videos with no-supervision. Most existing video
understanding models work well on short videos of pre-segmented events, yet
they struggle to summarise longer videos where relevant events are sparsely
distributed and not pre-segmented. Moreover, long-form video understanding
often relies on supervised hierarchical training that needs extensive
annotations which are costly, slow and prone to inconsistency. With ViSMaP we
bridge the gap between short videos (where annotated data is plentiful) and
long ones (where it's not). We rely on LLMs to create optimised
pseudo-summaries of long videos using segment descriptions from short ones.
These pseudo-summaries are used as training data for a model that generates
long-form video summaries, bypassing the need for expensive annotations of long
videos. Specifically, we adopt a meta-prompting strategy to iteratively
generate and refine creating pseudo-summaries of long videos. The strategy
leverages short clip descriptions obtained from a supervised short video model
to guide the summary. Each iteration uses three LLMs working in sequence: one
to generate the pseudo-summary from clip descriptions, another to evaluate it,
and a third to optimise the prompt of the generator. This iteration is
necessary because the quality of the pseudo-summaries is highly dependent on
the generator prompt, and varies widely among videos. We evaluate our summaries
extensively on multiple datasets; our results show that ViSMaP achieves
performance comparable to fully supervised state-of-the-art models while
generalising across domains without sacrificing performance. Code will be
released upon publication.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces vismap, an unsupervised video summarization system using meta-prompting and llms to generate pseudo-summaries of long videos from short clip descriptions, achieving performance comparable to supervised methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了vismap，一种无监督视频摘要系统，它使用元提示和大型语言模型从短视频片段描述生成长视频的伪摘要，性能与监督方法相当。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15921v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jian Hu, Dimitrios Korkinof, Shaogang Gong, Mariano Beguerisse-Diaz</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.8500000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">VLM-based Prompts as the Optimal Assistant for Unpaired Histopathology Virtual Staining</h2>
            <p class="paper-summary">In histopathology, tissue sections are typically stained using common H&E
staining or special stains (MAS, PAS, PASM, etc.) to clearly visualize specific
tissue structures. The rapid advancement of deep learning offers an effective
solution for generating virtually stained images, significantly reducing the
time and labor costs associated with traditional histochemical staining.
However, a new challenge arises in separating the fundamental visual
characteristics of tissue sections from the visual differences induced by
staining agents. Additionally, virtual staining often overlooks essential
pathological knowledge and the physical properties of staining, resulting in
only style-level transfer. To address these issues, we introduce, for the first
time in virtual staining tasks, a pathological vision-language large model
(VLM) as an auxiliary tool. We integrate contrastive learnable prompts,
foundational concept anchors for tissue sections, and staining-specific concept
anchors to leverage the extensive knowledge of the pathological VLM. This
approach is designed to describe, frame, and enhance the direction of virtual
staining. Furthermore, we have developed a data augmentation method based on
the constraints of the VLM. This method utilizes the VLM's powerful image
interpretation capabilities to further integrate image style and structural
information, proving beneficial in high-precision pathological diagnostics.
Extensive evaluations on publicly available multi-domain unpaired staining
datasets demonstrate that our method can generate highly realistic images and
enhance the accuracy of downstream tasks, such as glomerular detection and
segmentation. Our code is available at:
https://github.com/CZZZZZZZZZZZZZZZZZ/VPGAN-HARBOR</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a novel virtual staining method for histopathology images using a vision-language large model (vlm) with contrastive learnable prompts and vlm-based data augmentation, achieving realistic image generation and improved downstream task performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种新颖的组织病理学图像虚拟染色方法，该方法使用视觉-语言大模型（vlm），结合对比学习提示和基于vlm的数据增强，实现了逼真的图像生成和下游任务性能的提升。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15545v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zizhi Chen, Xinyu Zhang, Minghao Han, Yizhou Liu, Ziyun Qian, Weifeng Zhang, Xukun Zhang, Jingwei Wei, Lihua Zhang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.9, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">InstaRevive: One-Step Image Enhancement via Dynamic Score Matching</h2>
            <p class="paper-summary">Image enhancement finds wide-ranging applications in real-world scenarios due
to complex environments and the inherent limitations of imaging devices. Recent
diffusion-based methods yield promising outcomes but necessitate prolonged and
computationally intensive iterative sampling. In response, we propose
InstaRevive, a straightforward yet powerful image enhancement framework that
employs score-based diffusion distillation to harness potent generative
capability and minimize the sampling steps. To fully exploit the potential of
the pre-trained diffusion model, we devise a practical and effective diffusion
distillation pipeline using dynamic control to address inaccuracies in updating
direction during score matching. Our control strategy enables a dynamic
diffusing scope, facilitating precise learning of denoising trajectories within
the diffusion model and ensuring accurate distribution matching gradients
during training. Additionally, to enrich guidance for the generative power, we
incorporate textual prompts via image captioning as auxiliary conditions,
fostering further exploration of the diffusion model. Extensive experiments
substantiate the efficacy of our framework across a diverse array of
challenging tasks and datasets, unveiling the compelling efficacy and
efficiency of InstaRevive in delivering high-quality and visually appealing
results. Code is available at https://github.com/EternalEvan/InstaRevive.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces instarevive, a one-step image enhancement framework using score-based diffusion distillation with dynamic control and textual prompts to achieve efficient and high-quality results.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种名为instarevive的图像增强框架，该框架采用基于分数的扩散蒸馏，结合动态控制和文本提示，实现高效高质量的图像增强。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15513v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yixuan Zhu, Haolin Wang, Ao Li, Wenliang Zhao, Yansong Tang, Jingxuan Niu, Lei Chen, Jie Zhou, Jiwen Lu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.9500000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">MirrorVerse: Pushing Diffusion Models to Realistically Reflect the World</h2>
            <p class="paper-summary">Diffusion models have become central to various image editing tasks, yet they
often fail to fully adhere to physical laws, particularly with effects like
shadows, reflections, and occlusions. In this work, we address the challenge of
generating photorealistic mirror reflections using diffusion-based generative
models. Despite extensive training data, existing diffusion models frequently
overlook the nuanced details crucial to authentic mirror reflections. Recent
approaches have attempted to resolve this by creating synhetic datasets and
framing reflection generation as an inpainting task; however, they struggle to
generalize across different object orientations and positions relative to the
mirror. Our method overcomes these limitations by introducing key augmentations
into the synthetic data pipeline: (1) random object positioning, (2) randomized
rotations, and (3) grounding of objects, significantly enhancing generalization
across poses and placements. To further address spatial relationships and
occlusions in scenes with multiple objects, we implement a strategy to pair
objects during dataset generation, resulting in a dataset robust enough to
handle these complex scenarios. Achieving generalization to real-world scenes
remains a challenge, so we introduce a three-stage training curriculum to
develop the MirrorFusion 2.0 model to improve real-world performance. We
provide extensive qualitative and quantitative evaluations to support our
approach. The project page is available at: https://mirror-verse.github.io/.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces mirrorverse, a diffusion model trained with a novel synthetic data generation pipeline and a three-stage training curriculum to improve the realism of mirror reflections, addressing limitations of existing models in handling object orientations, positions, and occlusions in complex scenes.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了mirrorverse，一个扩散模型，通过新颖的合成数据生成流程和三阶段训练课程进行训练，以提高镜面反射的真实感，解决了现有模型在处理复杂场景中物体方向、位置和遮挡方面的局限性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15397v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Ankit Dhiman, Manan Shah, R Venkatesh Babu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.0, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">FaceInsight: A Multimodal Large Language Model for Face Perception</h2>
            <p class="paper-summary">Recent advances in multimodal large language models (MLLMs) have demonstrated
strong capabilities in understanding general visual content. However, these
general-domain MLLMs perform poorly in face perception tasks, often producing
inaccurate or misleading responses to face-specific queries. To address this
gap, we propose FaceInsight, the versatile face perception MLLM that provides
fine-grained facial information. Our approach introduces visual-textual
alignment of facial knowledge to model both uncertain dependencies and
deterministic relationships among facial information, mitigating the
limitations of language-driven reasoning. Additionally, we incorporate face
segmentation maps as an auxiliary perceptual modality, enriching the visual
input with localized structural cues to enhance semantic understanding.
Comprehensive experiments and analyses across three face perception tasks
demonstrate that FaceInsight consistently outperforms nine compared MLLMs under
both training-free and fine-tuned settings.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: faceinsight, a new multimodal large language model, is proposed to address the poor performance of general mllms in face perception tasks by incorporating visual-textual alignment of facial knowledge and face segmentation maps as an auxiliary modality, demonstrating superior performance in experiments.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: faceinsight是一种新型多模态大型语言模型，旨在解决通用 mllm 在面部感知任务中的不良表现，通过结合面部知识的视觉-文本对齐和面部分割图作为辅助模态，实验表明该模型具有优越的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15624v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jingzhi Li, Changjiang Luo, Ruoyu Chen, Hua Zhang, Wenqi Ren, Jianhou Gan, Xiaochun Cao</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.05, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">MVQA: Mamba with Unified Sampling for Efficient Video Quality Assessment</h2>
            <p class="paper-summary">The rapid growth of long-duration, high-definition videos has made efficient
video quality assessment (VQA) a critical challenge. Existing research
typically tackles this problem through two main strategies: reducing model
parameters and resampling inputs. However, light-weight Convolution Neural
Networks (CNN) and Transformers often struggle to balance efficiency with high
performance due to the requirement of long-range modeling capabilities.
Recently, the state-space model, particularly Mamba, has emerged as a promising
alternative, offering linear complexity with respect to sequence length.
Meanwhile, efficient VQA heavily depends on resampling long sequences to
minimize computational costs, yet current resampling methods are often weak in
preserving essential semantic information. In this work, we present MVQA, a
Mamba-based model designed for efficient VQA along with a novel Unified
Semantic and Distortion Sampling (USDS) approach. USDS combines semantic patch
sampling from low-resolution videos and distortion patch sampling from
original-resolution videos. The former captures semantically dense regions,
while the latter retains critical distortion details. To prevent computation
increase from dual inputs, we propose a fusion mechanism using pre-defined
masks, enabling a unified sampling strategy that captures both semantic and
quality information without additional computational burden. Experiments show
that the proposed MVQA, equipped with USDS, achieve comparable performance to
state-of-the-art methods while being $2\times$ as fast and requiring only $1/5$
GPU memory.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces mvqa, a mamba-based video quality assessment model with a unified semantic and distortion sampling strategy (usds) for improved efficiency and performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 这篇论文介绍了mvqa，一个基于mamba的视频质量评估模型，采用统一的语义和失真采样策略(usds)，旨在提高效率和性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16003v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yachun Mi, Yu Li, Weicheng Meng, Chaofeng Chen, Chen Hui, Shaohui Liu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.1, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">AffordanceSAM: Segment Anything Once More in Affordance Grounding</h2>
            <p class="paper-summary">Improving the generalization ability of an affordance grounding model to
recognize regions for unseen objects and affordance functions is crucial for
real-world application. However, current models are still far away from such
standards. To address this problem, we introduce AffordanceSAM, an effective
approach that extends SAM's generalization capacity to the domain of affordance
grounding. For the purpose of thoroughly transferring SAM's robust performance
in segmentation to affordance, we initially propose an affordance-adaption
module in order to help modify SAM's segmentation output to be adapted to the
specific functional regions required for affordance grounding. We concurrently
make a coarse-to-fine training recipe to make SAM first be aware of affordance
objects and actions coarsely, and then be able to generate affordance heatmaps
finely. Both quantitative and qualitative experiments show the strong
generalization capacity of our AffordanceSAM, which not only surpasses previous
methods under AGD20K benchmark but also shows evidence to handle the task with
novel objects and affordance functions.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces affordancesam, an approach that extends the segment anything model (sam) for improved affordance grounding, demonstrating strong generalization on unseen objects and affordance functions.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了affordancesam，该方法扩展了segment anything model (sam)，以改进可供性接地，并在未见过的对象和可供性函数上表现出强大的泛化能力。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15650v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Dengyang Jiang, Mengmeng Wang, Teli Ma, Hengzhuang Li, Yong liu, Guang Dai, Lei Zhang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.1500000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">RepNet-VSR: Reparameterizable Architecture for High-Fidelity Video Super-Resolution</h2>
            <p class="paper-summary">As a fundamental challenge in visual computing, video super-resolution (VSR)
focuses on reconstructing highdefinition video sequences from their degraded
lowresolution counterparts. While deep convolutional neural networks have
demonstrated state-of-the-art performance in spatial-temporal super-resolution
tasks, their computationally intensive nature poses significant deployment
challenges for resource-constrained edge devices, particularly in real-time
mobile video processing scenarios where power efficiency and latency
constraints coexist. In this work, we propose a Reparameterizable Architecture
for High Fidelity Video Super Resolution method, named RepNet-VSR, for
real-time 4x video super-resolution. On the REDS validation set, the proposed
model achieves 27.79 dB PSNR when processing 180p to 720p frames in 103 ms per
10 frames on a MediaTek Dimensity NPU. The competition results demonstrate an
excellent balance between restoration quality and deployment efficiency. The
proposed method scores higher than the previous champion algorithm of MAI video
super-resolution challenge.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: repnet-vsr presents a reparameterizable architecture for real-time 4x video super-resolution, achieving a good balance between restoration quality and deployment efficiency on edge devices.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: repnet-vsr提出了一种可重参数化的架构，用于实时4倍视频超分辨率，实现了边缘设备上恢复质量和部署效率之间的良好平衡。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15649v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Biao Wu, Diankai Zhang, Shaoli Liu, Si Gao, Chengjian Zheng, Ning Wang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.2000000000000002, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Unifying Image Counterfactuals and Feature Attributions with Latent-Space Adversarial Attacks</h2>
            <p class="paper-summary">Counterfactuals are a popular framework for interpreting machine learning
predictions. These what if explanations are notoriously challenging to create
for computer vision models: standard gradient-based methods are prone to
produce adversarial examples, in which imperceptible modifications to image
pixels provoke large changes in predictions. We introduce a new,
easy-to-implement framework for counterfactual images that can flexibly adapt
to contemporary advances in generative modeling. Our method, Counterfactual
Attacks, resembles an adversarial attack on the representation of the image
along a low-dimensional manifold. In addition, given an auxiliary dataset of
image descriptors, we show how to accompany counterfactuals with feature
attribution that quantify the changes between the original and counterfactual
images. These importance scores can be aggregated into global counterfactual
explanations that highlight the overall features driving model predictions.
While this unification is possible for any counterfactual method, it has
particular computational efficiency for ours. We demonstrate the efficacy of
our approach with the MNIST and CelebA datasets.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces a new method, counterfactual attacks, for generating counterfactual images using latent-space adversarial attacks and demonstrates its ability to provide feature attribution, unifying counterfactual explanations for computer vision models.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为“反事实攻击”的新方法，该方法使用潜在空间对抗性攻击生成反事实图像，并展示了其提供特征归因的能力，从而统一了针对计算机视觉模型的反事实解释。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15479v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jeremy Goldwasser, Giles Hooker</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.25, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">LongPerceptualThoughts: Distilling System-2 Reasoning for System-1 Perception</h2>
            <p class="paper-summary">Recent reasoning models through test-time scaling have demonstrated that long
chain-of-thoughts can unlock substantial performance boosts in hard reasoning
tasks such as math and code. However, the benefit of such long thoughts for
system-2 reasoning is relatively less explored in other domains such as
perceptual tasks where shallower, system-1 reasoning seems sufficient. In this
paper, we introduce LongPerceptualThoughts, a new synthetic dataset with 30K
long-thought traces for perceptual tasks. The key challenges in synthesizing
elaborate reasoning thoughts for perceptual tasks are that off-the-shelf models
are not yet equipped with such thinking behavior and that it is not
straightforward to build a reliable process verifier for perceptual tasks.
Thus, we propose a novel three-stage data synthesis framework that first
synthesizes verifiable multiple-choice questions from dense image descriptions,
then extracts simple CoTs from VLMs for those verifiable problems, and finally
expands those simple thoughts to elaborate long thoughts via frontier reasoning
models. In controlled experiments with a strong instruction-tuned 7B model, we
demonstrate notable improvements over existing visual reasoning data-generation
methods. Our model, trained on the generated dataset, achieves an average +3.4
points improvement over 5 vision-centric benchmarks, including +11.8 points on
V$^*$ Bench. Notably, despite being tuned for vision tasks, it also improves
performance on the text reasoning benchmark, MMLU-Pro, by +2 points.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces longperceptualthoughts, a synthetic dataset with long-reasoning traces for perceptual tasks, and demonstrates its effectiveness in improving performance on vision and text reasoning benchmarks using a three-stage data synthesis framework.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了 longperceptualthoughts，一个带有长推理轨迹的感知任务合成数据集，并展示了其通过三阶段数据合成框架提高视觉和文本推理基准性能的有效性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.15362v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yuan-Hong Liao, Sven Elflein, Liu He, Laura Leal-Taixé, Yejin Choi, Sanja Fidler, David Acuna</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.3, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Classification of Firn Data via Topological Features</h2>
            <p class="paper-summary">In this paper we evaluate the performance of topological features for
generalizable and robust classification of firn image data, with the broader
goal of understanding the advantages, pitfalls, and trade-offs in topological
featurization. Firn refers to layers of granular snow within glaciers that
haven't been compressed into ice. This compactification process imposes
distinct topological and geometric structure on firn that varies with depth
within the firn column, making topological data analysis (TDA) a natural choice
for understanding the connection between depth and structure. We use two
classes of topological features, sublevel set features and distance transform
features, together with persistence curves, to predict sample depth from
microCT images. A range of challenging training-test scenarios reveals that no
one choice of method dominates in all categories, and uncoveres a web of
trade-offs between accuracy, interpretability, and generalizability.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper explores using topological data analysis (tda) techniques, specifically sublevel set features, distance transform features, and persistence curves, to classify firn image data and predict sample depth. it highlights trade-offs between accuracy, interpretability, and generalizability across different methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文探讨了使用拓扑数据分析 (tda) 技术，特别是亚水平集特征、距离变换特征和持久性曲线，来分类粒雪图像数据并预测样本深度。它强调了不同方法在准确性、可解释性和泛化能力之间的权衡。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(5/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.16150v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Sarah Day, Jesse Dimino, Matt Jester, Kaitlin Keegan, Thomas Weighill</p>
            
        </motion.div>
        
    </div>

    <footer class="footer">
        Generated on 2025-04-28 05:34:22 UTC. Powered by <a href="https://github.com/onion-liu" target="_blank">onion-liu</a>.
    </footer>

</body>
</html>